In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# CROSS-VALIDATION
# Import necessary libraries
! pip install pandas numpy statsmodels scikit-learn ISLP
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from ISLP import load_data
In [17]:
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')
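In [ ]:
# Optional sketch (not in the original notebook): a quick look at the data
# before modeling, to confirm its size and the variables used below.
print(Auto.shape)
print(Auto.columns.tolist())
Auto.head()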
In [19]:
# 1. Validation Set Approach
# Select predictor and response variables
X = Auto[['weight', 'horsepower', 'acceleration']].values
y = Auto['mpg'].values
# Set a random seed and take a random subsample (size 250)
np.random.seed(42)
n = len(y)
Z = np.random.choice(n, 250, replace=False) # Random subsample
# Fit linear regression using training data
reg = LinearRegression()
reg.fit(X[Z], y[Z])
# Predict on the testing data (those not in Z)
test_indices = np.setdiff1d(np.arange(n), Z) # Get indices that are not in Z
mpg_predicted = reg.predict(X[test_indices])
# Calculate MSE
mse_validation = mean_squared_error(y[test_indices], mpg_predicted)
print(f"Validation MSE: {mse_validation}")
Validation MSE: 14.044255907353062
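In [ ]:
# Optional sketch (not in the original notebook): the same idea using
# scikit-learn's train_test_split. Repeating the split with a few arbitrary
# seeds shows that the validation-set MSE depends on which split we draw.
from sklearn.model_selection import train_test_split
for seed in (0, 1, 2):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=250, random_state=seed)
    mse = mean_squared_error(y_te, LinearRegression().fit(X_tr, y_tr).predict(X_te))
    print(f"seed {seed}: validation MSE = {mse:.3f}")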
In [25]:
# 2. Jackknife (Leave-One-Out Cross-Validation, LOOCV)
# Manual LOOCV
Yhat = np.zeros(n)
for i in range(n):
    X_train = np.delete(X, i, axis=0)
    y_train = np.delete(y, i, axis=0)
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    Yhat[i] = reg.predict(X[i].reshape(1, -1))[0]
# Calculate MSE for LOOCV
mse_loocv_manual = mean_squared_error(y, Yhat)
print(f"LOOCV MSE (manual): {mse_loocv_manual}")
# Using cross_val_score from sklearn for LOOCV
loo = LeaveOneOut()
reg = LinearRegression()
mse_loocv_sklearn = -np.mean(cross_val_score(reg, X, y, cv=loo, scoring='neg_mean_squared_error'))
print(f"LOOCV MSE (sklearn): {mse_loocv_sklearn}")
LOOCV MSE (manual): 18.25595080469144
LOOCV MSE (sklearn): 18.255950804691444
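In [ ]:
# Optional sketch (not in the original notebook): for linear least squares,
# LOOCV needs no refitting. With hat matrix H = X(X'X)^(-1)X' and leverages
# h_ii, the LOOCV MSE equals mean(((y_i - yhat_i) / (1 - h_ii))^2).
X1 = np.column_stack([np.ones(n), X])      # design matrix with intercept
H = X1 @ np.linalg.inv(X1.T @ X1) @ X1.T   # hat matrix
h = np.diag(H)                             # leverages h_ii
resid = y - H @ y                          # full-sample residuals
mse_loocv_closed = np.mean((resid / (1 - h)) ** 2)
print(f"LOOCV MSE (closed form): {mse_loocv_closed}")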
In [27]:
# 3. K-Fold Cross-Validation
kf = KFold(n_splits=30, shuffle=True, random_state=42)
cv_error = []
# Polynomial regression and cross-validation
for p in range(1, 11):
    poly = PolynomialFeatures(degree=p)
    X_poly = poly.fit_transform(Auto[['weight', 'horsepower', 'acceleration']])
    reg = LinearRegression()
    mse_kfold = -np.mean(cross_val_score(reg, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_error.append(mse_kfold)
# Display errors for each degree
print(cv_error)
# Find the degree with the minimum error
min_error_degree = np.argmin(cv_error) + 1 # +1 because Python index starts from 0
print(f"Degree with minimum error: {min_error_degree}")
[18.27445365338935, 15.968997263479562, 19.66681010976254, 20.537052894764713, 37.131761204331504, 29.717233687052477, 2602.9910843513617, 2402.2450938767124, 323.7778722471489, 335.79439151783146]
Degree with minimum error: 2
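In [ ]:
# Optional sketch (not in the original notebook): the huge CV errors at
# degrees 7-8 above are likely numerical, since raw features like weight
# (in the thousands) become ill-conditioned when raised to high powers.
# Standardizing inside a Pipeline (refit within each training fold) is one
# common remedy; this setup is an illustration, not part of the exercise.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
cv_error_scaled = []
for p in range(1, 11):
    model = make_pipeline(StandardScaler(), PolynomialFeatures(degree=p), LinearRegression())
    mse = -np.mean(cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_error_scaled.append(mse)
print(np.round(cv_error_scaled, 3))
print(f"Degree with minimum error: {np.argmin(cv_error_scaled) + 1}")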